#!/usr/local/bin/perl

# Resolve the input file and its short source code from the first command-line
# argument.  "wos" and "mag" (matched case-insensitively) select built-in
# datasets; any other argument is taken as the path of the file to read and is
# labelled "file".  The script stops with a usage message when that path does
# not exist.
my @source_presets = (
    [ qr/^wos$/i, "wos", "/projectnb/marxnsf1/dropbox/bigdata/nplmatch/splitcode/wosplpubinfo1955-2017_filteredISS.txt" ],
    [ qr/^mag$/i, "mag", "/projectnb/marxnsf1/dropbox/bigdata/mag/txt/mergedmagfornpl-fixednames.tsv" ],
);

# The two patterns cannot both match, so at most one preset is selected.
my ($chosen_preset) = grep { $ARGV[0] =~ $_->[0] } @source_presets;

if (defined $chosen_preset) {
    $sourcefilecode = $chosen_preset->[1];
    $inputfile      = $chosen_preset->[2];
}
else {
    $sourcefilecode = "file";
    $inputfile      = $ARGV[0];

    die("Usage: buildtitleregex.pl mag|wos|filename_or_fullpath_of_file\n")
        unless -e $inputfile;
}

print "Using source file: $inputfile\n\n";

# Open the source file read-only.  The three-argument form keeps characters in
# a user-supplied file name from being interpreted as an open mode, and the
# check stops the run immediately instead of silently producing no output when
# the file cannot be read.
open(INFILE, '<', $inputfile)
    or die "Cannot open source file $inputfile: $!\n";
# A space separates patent & ref for the master NPL.
# A tab separates patent & ref for the yearly slices (which also have the year
# at the end, which could create false positives).

# Upper bound (exclusive) on the volume/issue/first-page number that may be
# used as a bucket key by the read loop.
$maxpagevolissue=100000;

# Directory that receives the generated per-year scripts.
$outputdir="/projectnb/marxnsf1/dropbox/bigdata/nplmatch/splitcode/year_regex_scripts_" . "$sourcefilecode". "/";

# Root of the per-year, per-bucket files that the generated scripts will scan.
$inputdir="/projectnb/marxnsf1/dropbox/bigdata/nplmatch/splityear/";

# Start timestamp.
$date=`date`;
print "$date";

# Every path used after this point is absolute, so a failed chdir is reported
# but does not stop the run.
chdir("/projectnb/marxnsf1/dropbox/bigdata/nplmatch/splitcode")
    or warn "Cannot chdir to /projectnb/marxnsf1/dropbox/bigdata/nplmatch/splitcode: $!\n";

# Escape a value so it can sit inside a double-quoted string literal of the
# generated script without ending the literal or being interpolated.
sub _dq_literal {
    my ($text) = @_;
    $text =~ s/([\\"\$\@])/\\$1/g;
    return $text;
}

# Pick the bucket key for a record: the first of volume, first page, issue
# (in that order) that is numerically the largest of the three, is >= 0 and is
# below $limit.  The field's original string is returned, because it is used
# as a file name later.  Returns nothing when no field qualifies.
sub _pick_bucket {
    my ($vol, $firstpage, $issue, $limit) = @_;
    for my $candidate ($vol, $firstpage, $issue) {
        return $candidate
            if $candidate >= 0
            && $candidate >= $vol
            && $candidate >= $firstpage
            && $candidate >= $issue
            && $candidate < $limit;
    }
    return;
}

# Turn one tab-separated input line (year, id, volume, issue, first page,
# first author, title, journal, ...) into a line of generated Perl code.
# Returns ($year, $bucket, $rule), or an empty list when the line has fewer
# than eight fields, has no usable bucket, or fails the validity filter.
sub _rule_for_record {
    my ($line, $limit) = @_;

    # Strip only the line terminator.  chop() would eat the last character of
    # the journal when the final line has no newline, and would leave the CR
    # of a CRLF ending inside the journal name.
    $line =~ s/\r?\n\z//;

    my @field = split /\t/, $line, 9;
    return if @field < 8;
    my ($year, $wosid, $vol, $issue, $firstpage, $firstauthor, $title, $journal) = @field;

    # Remove EVERY occurrence of the unwanted characters; removing only the
    # first one left later slashes/parentheses in the generated regex.
    tr{/}{}d for $year, $wosid;
    my $firstauthor_print = $firstauthor;    # full author string, for output only
    $firstauthor =~ tr{/}{}d;
    $firstauthor =~ s/,.*//;                 # keep the part before the first comma
    tr{/?()}{}d for $vol, $issue, $firstpage;
    $title   =~ s/[^a-zA-Z0-9-,'.(): ]//g;
    $journal =~ s/"//g;

    my $bucket = _pick_bucket($vol, $firstpage, $issue, $limit);
    return unless defined $bucket;

    # Validity filter: a usable author, a plausible year and all three numbers.
    return if $firstauthor eq "" || $firstauthor =~ m/\?/ || $firstauthor =~ m/anonymous/;
    return if $year < 1800 || $year > 2017;
    return if $vol eq "" || $issue eq "" || $firstpage eq "";

    # Data values are matched literally: quotemeta keeps characters such as
    # "(", "[", ".", "$" or "@" in a name from changing or breaking the regex.
    my ($author_re, $vol_re, $issue_re, $page_re) =
        map { quotemeta($_) } $firstauthor, $vol, $issue, $firstpage;

    # Values echoed by the generated print statement, escaped for its literal.
    my $printed = join "\t", map { _dq_literal($_) }
        $wosid, $year, $vol, $issue, $firstpage, $firstauthor_print, $title, $journal;

    # Used to use a min/max scheme, but this led to false positives.  Now the
    # rule requires the author plus volume, issue and first page in sequence.
    my $rule = "\tif (/\\W$author_re\\W/ && /\\D$vol_re\\D+$issue_re\\D+$page_re\\D/)"
             . " { print \"$printed\t\$_\"; }\n";

    return ($year, $bucket, $rule);
}

# Read the source file and collect the generated rules in
# %Output{year}{bucket}, reporting progress every 100000 lines.
$linect=0;
while (my $line = <INFILE>) {
    $linect++;
    if (($linect % 100000)==0) {
        print "At line $linect\n";
    }

    my ($year, $bucket, $rule) = _rule_for_record($line, $maxpagevolissue);
    next unless defined $rule;

    $Output{$year}{$bucket} .= $rule;
}

print "\n";

# Write one executable matcher script per year found in %Output.  Each script
# contains, for every bucket whose slice file exists under $inputdir/<year>/,
# a loop that opens that file and applies the bucket's accumulated rules to
# every line.
foreach my $year (sort(keys %Output)) {
    print "Year is $year\n";
    my $outputfile = "$outputdir"."year"."$year".".pl";

    # An unchecked open would make every later print fail silently; report
    # the problem and move on to the next year instead.
    my $out;
    if (!open($out, '>', $outputfile)) {
        warn "Cannot write $outputfile: $!; skipping year $year\n";
        next;
    }

    print {$out} "#!/share/pkg/perl/5.24.0/install/bin/perl\n\n";

    # Sorted so that repeated runs produce identical scripts.
    foreach my $pagevolmax (sort(keys %{ $Output{$year} })) {
        my $pagevolfilepath = "$inputdir"."$year/"."$pagevolmax";

        # Skip if this file does not exist.  Possibly notate this somewhere.
        next unless -e $pagevolfilepath;

        print {$out} "open(INFILE,\"$pagevolfilepath\");\n";
        print {$out} "while (<INFILE>) {\n";
        print {$out} "$Output{$year}{$pagevolmax}";
        print {$out} "}\n";
        print {$out} "close(INFILE);\n\n";
    }

    # Buffered write errors (e.g. a full disk) only surface at close.
    close($out) or warn "Error closing $outputfile: $!\n";

    # Built-in chmod: no shell is involved, so the data-derived file name
    # cannot be misread as shell syntax.
    chmod(0775, $outputfile) or warn "Cannot chmod $outputfile: $!\n";
}

# End timestamp.
$date=`date`;
print "$date";
